In [ ]:

    
%load_ext autoreload
%autoreload 2
import ast
import cPickle
import os
import sys

import numpy as np









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Define a char set:



In [17]:

    
# Character vocabulary: lowercase letters, digits, punctuation, and the
# end-of-sequence token. A character's position in this list is its index.
chars = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
                 '0','1','2','3','4','5','6','7','8','9',
                 ' ',',','.',':',';',"'",'!','?','$','%','&','(',')','=','+','-','<EOS>']

# Forward lookup: character -> index.
chars_to_idx = {symbol: position for position, symbol in enumerate(chars)}

# Reverse lookup: index -> character.
idx_to_chars = {position: symbol for symbol, position in chars_to_idx.items()}



In [1]:

    
def stringToOneHot(s, chars_to_idx, lower=True):
    """Encode a string as a sequence of one-hot character vectors.

    Args:
        s: the text to encode.
        chars_to_idx: dict mapping each known character, and the '<EOS>'
            token, to its column index.
        lower: when True, `s` is lower-cased before encoding.

    Returns:
        A float16 array of shape (len(s) + 1, len(chars_to_idx) + 1).
        Row i marks the i-th character of `s`; a character missing from
        `chars_to_idx` is marked in the extra last column (unknown).
        The final row marks the '<EOS>' token.

    Raises:
        KeyError: if '<EOS>' is not a key of `chars_to_idx`.
    """
    text = s.lower() if lower else s

    # One extra column past the known characters is reserved for "unknown";
    # one extra row past the text is reserved for <EOS>.
    unknown_col = len(chars_to_idx)
    v_seq = np.zeros((len(text) + 1, unknown_col + 1), dtype=np.float16)

    for row, symbol in enumerate(text):
        # Characters without an index fall back to the unknown column.
        v_seq[row, chars_to_idx.get(symbol, unknown_col)] = 1.0

    v_seq[-1, chars_to_idx['<EOS>']] = 1.0
    return v_seq

def oneHotToString(seq, idx_to_chars):
    """Decode a sequence of one-hot vectors back into a string.

    Args:
        seq: iterable of one-hot vectors, one per character.
        idx_to_chars: dict mapping a column index to its character.

    Returns:
        The concatenation of the character found at each vector's argmax;
        an argmax with no entry in `idx_to_chars` contributes '<UNK>'.
    """
    return "".join(
        idx_to_chars.get(np.argmax(one_hot_vec), '<UNK>')
        for one_hot_vec in seq
    )

Load the data:



In [19]:

    
# Paths to the MovieQA text files, relative to this notebook.
movieQA_folder = os.path.join('.', '..', 'Data', 'MovieQA')
movie_convs_txt = os.path.join(movieQA_folder, 'movie_conversations.txt')
movie_lines_txt = os.path.join(movieQA_folder, 'movie_lines.txt')

# Load both text files into numpy string arrays. They are read with the same
# options: fields are separated by ' +++$+++ ', and comments=None disables
# comment handling so no character is treated as a comment marker.
loadtxt_options = dict(dtype='string', delimiter=' +++$+++ ', comments=None)
movie_convs_np = np.loadtxt(movie_convs_txt, **loadtxt_options)
movie_lines_np = np.loadtxt(movie_lines_txt, **loadtxt_options)

print("Number of conversations : %d" % len(movie_convs_np))
print("Number of lines : %d" % len(movie_lines_np))









    



Number of conversations : 83097
Number of lines : 304713

Create dictionaries of movie lines



In [20]:

    
# Two lookups keyed by line ID (first field of each row):
#   line_to_one_hot   -> one-hot encoding of the line's text (last field)
#   line_to_movie_car -> movie character ID (second field)
line_to_one_hot = {}
line_to_movie_car = {}

for fields in movie_lines_np:
    line_id, movie_char_id, text = fields[0], fields[1], fields[-1]
    line_to_one_hot[line_id] = stringToOneHot(text, chars_to_idx, lower=True)
    line_to_movie_car[line_id] = movie_char_id



In [21]:

    
# Sanity check: how many lines were encoded, then one line decoded back to text.
print(len(line_to_one_hot))
print(oneHotToString(line_to_one_hot['L205'], idx_to_chars))









    



304713
unsolved mystery.  she used to be really popular when she started high school, then it was just like she got sick of it or something.<EOS>

Create a list of Q/A pairs - AKA the dataset:



In [22]:

    
# Create a list of Q/A pairs.
#  Every two consecutive lines of a conversation form one (question, answer)
#  pair, so a conversation with n lines contributes n - 1 pairs.
#  This is the simplest approach. We should be able to train a mediocre
#  (character-level) language model with this.
#  Eventually, this dataset could be more useful for a dialogue model, since
#  most conversations have more than 2 interactions.

qa_pairs = []
for conversation in movie_convs_np:
    # The last field is expected to hold the conversation's line IDs written as
    # a Python list literal. ast.literal_eval only parses literals, so unlike
    # eval() it cannot execute code embedded in the data file.
    lines = ast.literal_eval(conversation[-1])
    # Pair each line with the one that follows it.
    for question_id, answer_id in zip(lines[:-1], lines[1:]):
        qa_pairs.append((line_to_one_hot[question_id], line_to_one_hot[answer_id]))
print("Got %d Q/A pairs." % len(qa_pairs))









    



Got 221616 Q/A pairs.



In [23]:

    
# Sanity check: decode one pair, question first and answer second.
idx = 7
for one_hot_seq in qa_pairs[idx]:
    print(oneHotToString(one_hot_seq, idx_to_chars))









    



why?<EOS>
unsolved mystery.  she used to be really popular when she started high school, then it was just like she got sick of it or something.<EOS>

Save the dataset:



In [24]:

    
# Pickle the pairs under the "qa_data" key, next to the source text files.
qa_pairs_pkl = os.path.join(movieQA_folder, 'QA_Pairs.pkl')
payload = {"qa_data": qa_pairs}
with open(qa_pairs_pkl, 'wb') as pkl_file:
    cPickle.dump(payload, pkl_file, cPickle.HIGHEST_PROTOCOL)

Load the pkl dataset:



In [25]:

    
# Sanity check: read back the file written above and pull the pairs out again.
# Note: unpickling can execute arbitrary code, so only load files you trust.
with open(qa_pairs_pkl, 'rb') as pkl_file:
    data = cPickle.load(pkl_file)
    qa_pairs = data["qa_data"]



In [26]:

    
# Show a few randomly drawn Q/A pairs, decoded back to text.
n_examples = 5

for example_no in range(n_examples):
    idx = np.random.randint(len(qa_pairs))
    question, answer = qa_pairs[idx][0], qa_pairs[idx][1]
    print(oneHotToString(question, idx_to_chars))
    print(oneHotToString(answer, idx_to_chars))
    print("")









    



nancy, don't fall asleep in there.<EOS>
i won't.<EOS>

i don't think we're being treated fairly, lois. and i'm going to tell miss warfield.<EOS>
clark . . . wait!<EOS>

you and my father have become very close.  perhaps one day i may say the same for us.<EOS>
you flatter me, caesar.<EOS>

so, what, this zimm guy asking for some kinda finders fee, that what we're talking about here?<EOS>
hey, zimm doesn't ask for dick. zimm tells you the way it is... or else.<EOS>

do you think you could ever think of a set of circumstances that would just cause you to haul off and shoot someone?<EOS>
i could shoot your cousin eddie.<EOS>



In [ ]: